# Create a DataFrame from plain Python lists.
import pandas as pd
letters = ['a','b','c','d']# values for the 'letters' column
numbers = [1,2,3,4]# values for the 'numbers' column
lists_df = pd.DataFrame(data={'letters':letters,
'numbers':numbers})# data is a dict mapping column name -> list of values
display(lists_df)
| letters | numbers | |
|---|---|---|
| 0 | a | 1 |
| 1 | b | 2 |
| 2 | c | 3 |
| 3 | d | 4 |
# Create a DataFrame from a NumPy array.
import numpy as np
numpy_data = np.array([1,2,3,4,5,6,7,8]).reshape(4,2) # a 4x2 array (4 rows, 2 columns)
numpy_df = pd.DataFrame(data=numpy_data,columns = ['column_1','column_2'])# data is a 2-D numpy array; columns supplies the header names
display(numpy_df)
| column_1 | column_2 | |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 3 | 4 |
| 2 | 5 | 6 |
| 3 | 7 | 8 |
# Create a DataFrame from records, i.e. a list of row tuples.
records = [('a',1),('b',2),('c',3),('d',4)]# one tuple per row
records_df = pd.DataFrame.from_records(records,
columns = ['letters','numbers'])# columns names the fields of each record
display(records_df)
| letters | numbers | |
|---|---|---|
| 0 | a | 1 |
| 1 | b | 2 |
| 2 | c | 3 |
| 3 | d | 4 |
# Build the example DataFrame reused by the next several examples;
# an explicit index labels the rows instead of the default 0..n-1.
ex_df = pd.DataFrame.from_records(records,
columns = ['letters','numbers'],
index = ['first','second','third','fourth'])# custom row labels
display(ex_df)
| letters | numbers | |
|---|---|---|
| first | a | 1 |
| second | b | 2 |
| third | c | 3 |
| fourth | d | 4 |
# .loc is the primary way to locate both rows and columns
display(ex_df.loc[:,['letters']]) #dataframe
| letters | |
|---|---|
| first | a |
| second | b |
| third | c |
| fourth | d |
display(ex_df.loc[:,'letters']) #series
first a second b third c fourth d Name: letters, dtype: object
display(ex_df.loc[:,[col for col in ex_df.columns if 'l' in col]])#select columns based on column name
| letters | |
|---|---|
| first | a |
| second | b |
| third | c |
| fourth | d |
display(ex_df.loc[['first','third'],:])#dataframe
| letters | numbers | |
|---|---|---|
| first | a | 1 |
| third | c | 3 |
display(ex_df.loc['first',:])#series
letters a numbers 1 Name: first, dtype: object
display(ex_df.loc[ex_df['numbers']<=2,:])#filter rows based on number column
| letters | numbers | |
|---|---|---|
| first | a | 1 |
| second | b | 2 |
display(ex_df.loc[ex_df.numbers<=2,:])#same thing, different way of specifying series
| letters | numbers | |
|---|---|---|
| first | a | 1 |
| second | b | 2 |
#apply functions
def times_two(num):
    """Return the given number multiplied by two."""
    return num * 2
def to_upper(lett):
    """Return the upper-cased form of the given string."""
    result = lett.upper()
    return result
# apply with a dict maps each named column through its own function
transformed_df = ex_df.apply({'letters':to_upper,'numbers':times_two})
display_side_by_side(ex_df,transformed_df,titles = ['Original','Transformed'])# display_side_by_side is a notebook helper defined elsewhere
Original
| Transformed
|
#create two tables to merge
ex_df2 = ex_df.apply({'numbers':times_two,'letters':lambda x:x}).rename(columns={'numbers':'2x_numbers'})
display_side_by_side(ex_df,ex_df2,titles = ['Original','Transformed'])
Original
| Transformed
|
#merge on the 'letters' column
merged_df = ex_df.merge(ex_df2,on='letters')
display_side_by_side(ex_df,ex_df2,merged_df,titles = ['Left','Right','Merged'])
Left
| Right
| Merged
|
#join operates on indexes instead of columns
joined_df = ex_df.join(ex_df2,lsuffix='_l',rsuffix='_r')
display_side_by_side(ex_df,ex_df2,joined_df, titles = ['Left','Right','Joined'])
Left
| Right
| Joined
|
#can specify different left and right merge columns
merged_df = ex_df.merge(ex_df2,left_on='letters',right_on='letters')
display_side_by_side(ex_df,ex_df2,merged_df, titles = ['Left','Right','Merged'])
Left
| Right
| Merged
|
#Default is 'inner' join, so unmatched keys are dropped
merged_df = ex_df.merge(ex_df2,left_on='numbers',right_on='2x_numbers')
display_side_by_side(ex_df,ex_df2,merged_df,titles = ['Left','Right','Merged'])
Left
| Right
| Merged
|
#You can specify the suffixes that appear when columns get duplicated
merged_df = ex_df.merge(ex_df2,left_on='numbers',right_on='2x_numbers',suffixes=('_l','_r'))
display_side_by_side(ex_df,ex_df2,merged_df,titles = ['Left','Right','Merged'])
Left
| Right
| Merged
|
#Specifying 'right' join causes *all* rows in the right dataframe to appear,
#and fill in na's if they are unmatched by the left dataframe
merged_df = ex_df.merge(ex_df2,left_on='numbers',right_on='2x_numbers',suffixes=('_l','_r'),how='right')
display_side_by_side(ex_df,ex_df2,merged_df, titles = ['Left','Right','Merged'])
Left
| Right
| Merged
|
#Concatenation is more like a union
ex_df2 = ex_df.apply({'numbers':times_two,'letters':lambda x:x})
concatenated_df = pd.concat([ex_df,ex_df2],axis=0,ignore_index=True)
display_side_by_side(ex_df,ex_df2,concatenated_df,titles = ['Left','Right','Concatenated'])
Left
| Right
| Concatenated
|
#concatenation along rows
concatenated_df = pd.concat([ex_df,ex_df2],axis=1)
display_side_by_side(ex_df,ex_df2,concatenated_df,titles = ['Left','Right','Concatenated'])
Left
| Right
| Concatenated
|
#what happens when you have duplicated columns
display(concatenated_df['letters'])
| letters | letters | |
|---|---|---|
| first | a | a |
| second | b | b |
| third | c | c |
| fourth | d | d |
#create plots
#from dataframe
import matplotlib.pyplot as plt
ex_df.set_index('letters').plot(kind='bar')
plt.show()
#with plotly
import plotly.express as px
fig = px.bar(ex_df,
x = 'letters',
y = 'numbers')
fig.update_layout(width=600,height=400)
fig.show()
#create summary tables
summary_df = ex_df.describe()
display_side_by_side(ex_df,summary_df,titles = ['Original','Summary'])
Original
| Summary
|
#column summaries
summary_df = ex_df.agg({'numbers':sum,'letters':min})
display(ex_df,summary_df)
| letters | numbers | |
|---|---|---|
| first | a | 1 |
| second | b | 2 |
| third | c | 3 |
| fourth | d | 4 |
numbers 10 letters a dtype: object
#groupby
ex_group_df = pd.concat([ex_df,
ex_df.shift(1).bfill().rename(index = dict((k,k+'_1') for k in ex_df.index)),
ex_df.shift(-1).ffill().rename(index = dict((k,k+'_-1') for k in ex_df.index)),
ex_df.shift(2).fillna(0).rename(index = dict((k,k+'_2') for k in ex_df.index))])
display(ex_group_df)
| letters | numbers | |
|---|---|---|
| first | a | 1.0 |
| second | b | 2.0 |
| third | c | 3.0 |
| fourth | d | 4.0 |
| first_1 | a | 1.0 |
| second_1 | a | 1.0 |
| third_1 | b | 2.0 |
| fourth_1 | c | 3.0 |
| first_-1 | b | 2.0 |
| second_-1 | c | 3.0 |
| third_-1 | d | 4.0 |
| fourth_-1 | d | 4.0 |
| first_2 | 0 | 0.0 |
| second_2 | 0 | 0.0 |
| third_2 | a | 1.0 |
| fourth_2 | b | 2.0 |
grouped_df = ex_group_df.groupby('letters').sum()#sum the numbers that correspond to a particular letter
sorted_ex_group_df = ex_group_df.astype({'letters':'string'}).sort_values('letters')#make all entries strings and sort
display_side_by_side(sorted_ex_group_df,grouped_df,titles = ['Original','Grouped'])
Original
| Grouped
|
#multiple aggregations
grouped_df = ex_group_df.groupby('letters').agg({'numbers':[min,max,sum,'count']})#create different aggregations for the same column
sorted_ex_group_df = ex_group_df.astype({'letters':'string'}).sort_values('letters')
display_side_by_side(sorted_ex_group_df,grouped_df,titles = ['Original','Multiple Aggregations '])
Original
| Multiple Aggregations
| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#Indices and columns can have arbitrary number of levels
grouped_df = ex_group_df.groupby('letters').agg({'numbers':[min,max,sum,'count']})
print('Hierarchical: ',grouped_df.columns)
grouped_df = grouped_df.reset_index()
grouped_df2 = grouped_df.copy()#this is important
grouped_df2.columns = grouped_df2.columns.droplevel()
print('Flat: ',grouped_df2.columns)
grouped_df2 = grouped_df2.reset_index().rename(columns={'':'letters'})
display_side_by_side(grouped_df,grouped_df2,titles = ['Hierarchical','Flat'])
Hierarchical: MultiIndex([('numbers', 'min'),
('numbers', 'max'),
('numbers', 'sum'),
('numbers', 'count')],
)
Flat: Index(['', 'min', 'max', 'sum', 'count'], dtype='object')
Hierarchical
| Flat
| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#melt a dataframe, make several columns into two
melted_df = grouped_df2.melt(id_vars = 'letters',
value_vars = ['min','max','sum','count'],
var_name = 'aggregation',
value_name = 'value')
#pivot dataframe, make two or more columns into many
pivot_df = melted_df.pivot(index = 'letters',
columns = 'aggregation',
values = 'value')
display_side_by_side(grouped_df2,melted_df,pivot_df,titles = ['Original','Melted','Pivoted'])
Original
| Melted
| Pivoted
|
I don't intend to find any manipulation in this data set or make any 'grand conclusions,' but hopefully you'll see some useful things you can do with pandas and get a neat view of an important week in the equities market.
The Investor's Exchange is one of the 15 or so independent stock exchanges that operate in the U.S. It was featured in the Michael Lewis book Flash Boys. They do their best to offer access to the market with protections in place that favor long-term over short-term strategies. They're pretty small compared to NYSE or NASDAQ, but they allow you to download pretty detailed historical data for free, even if it is a dump of network packets.
Every time a broker wants to make a trade, they put in an order (I want to buy 200 shares of stock A for $100 each). Until the order is canceled, or a seller is found that is willing to meet the buyer at their price, the order 'rests on the book.' Most heavily traded stocks do not have orders that rest long, unless the number of shares is exceptionally large or the price point is completely unreasonable.
| time | side | quantity | price |
|---|---|---|---|
| 9:30 | B | 100 | 45.00 |
| 9:31 | B | 200 | 45.01 |
| 9:31 | S | 100 | 45.05 |
| 9:32 | B | 50 | 45.00 |
This is relatively subjective, but the crux of many types of manipulation is 'are you engaging with the market in good faith?' In other words, are you placing orders onto the books because you would like to buy a stock at the price you stated (or at least hope someone will sell to you at that price), or are you sending false signals to manipulate at what prices people are willing to buy/sell.
| time | side | quantity | price | firm |
|---|---|---|---|---|
| 9:30 | B | 100 | 45.00 | A |
| 9:31 | B | 200 | 45.01 | A |
| 9:31 | S | 100 | 45.05 | B |
| 9:32 | B | 50 | 45.00 | A |
| 9:33 | S | 100 | 45.04 | Manip |
| 9:33 | S | 200 | 45.04 | Manip |
| 9:33 | S | 500 | 45.03 | Manip |
| 9:33 | S | 100 | 45.05 | Manip |
| 9:33 | S | 400 | 45.03 | Manip |
| 9:33 | S | 100 | 45.03 | Manip |
| 9:34 | S | 600 | 45.02 | IllBite |
| 9:34 | B | 600 | 45.02 | Manip |
Reg NMS is an SEC rule that created the 'National Market System.' Stock exchanges became publicly traded companies and firms routing orders to the markets gained the obligation of 'best execution.' This generally means that if NYSE's order book has a stock selling at 45.05 and NASDAQ's order book has the same stock selling at 45.04, your buy order with Charles Schwab account needs to take the shares at NASDAQ at the better price.
The regulation is intended to keep brokerages from giving customers a bad deal, but it also created a complicated set of rules that can be gamed.
Short answer: Regulators get to see much more detailed information than everyone else. It can be pretty difficult even with the extra information.
# Read the example order-event CSV.
# NOTE(review): header=1 treats the file's *second* line as the header row,
# skipping the first — presumably the file has a title/preamble line; confirm
# against example_manip.csv.
manip_df = pd.read_csv('example_manip.csv',
sep=',',
header=1)
display(manip_df)
| Time | Firm | Side | Price | Quantity | Order ID | Action | NBO | NBB | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 10/13/2021 9:31:00.0 AM | Firm S | S | 50.05 | 1000 | 1 | Order | 50.05 | 49.95 |
| 1 | 10/13/2021 9:31:00.0 AM | Firm B | B | 49.95 | 1000 | 2 | Order | 50.05 | 49.95 |
| 2 | 10/13/2021 9:31:00.2 AM | Firm M | S | 50.04 | 200 | 3 | Order | 50.04 | 49.95 |
| 3 | 10/13/2021 9:31:01.5 AM | Firm M | S | 50.03 | 100 | 4 | Order | 50.03 | 49.95 |
| 4 | 10/13/2021 9:31:03.0 AM | Firm M | S | 50.03 | 100 | 4 | Cancel | 50.04 | 49.95 |
| 5 | 10/13/2021 9:31:03.2 AM | Firm M | S | 50.01 | 150 | 5 | Order | 50.01 | 49.95 |
| 6 | 10/13/2021 9:31:03.3 AM | Firm M | S | 50.00 | 300 | 6 | Order | 50.00 | 49.95 |
| 7 | 10/13/2021 9:31:04.5 AM | Firm M | S | 50.03 | 500 | 7 | Order | 50.00 | 49.95 |
| 8 | 10/13/2021 9:31:04.6 AM | Firm M | S | 50.04 | 200 | 8 | Order | 50.00 | 49.95 |
| 9 | 10/13/2021 9:31:05.7 AM | Firm M | S | 49.96 | 200 | 9 | Order | 49.96 | 49.95 |
| 10 | 10/13/2021 9:31:05.7 AM | Firm M | S | 50.04 | 100 | 8 | Cancel | 49.96 | 49.95 |
| 11 | 10/13/2021 9:31:05.5 AM | Firm M | S | 49.97 | 600 | 10 | Order | 49.96 | 49.95 |
| 12 | 10/13/2021 9:31:05.6 AM | Firm M | S | 49.96 | 300 | 11 | Order | 49.96 | 49.95 |
| 13 | 10/13/2021 9:31:06.8 AM | Firm S | S | 49.96 | 1000 | 12 | Order | 49.96 | 49.95 |
| 14 | 10/13/2021 9:31:06.9 AM | Firm M | B | 49.96 | 1000 | 13 | Order | 49.96 | 49.96 |
| 15 | 10/13/2021 9:31:06.9 AM | Firm M | X | 49.96 | 1000 | 12_13 | Trade | 49.96 | 49.95 |
| 16 | 10/13/2021 9:31:07.0 AM | Firm M | S | 49.96 | 300 | 11 | Cancel | 49.97 | 49.95 |
| 17 | 10/13/2021 9:31:07.0 AM | Firm M | S | 49.97 | 600 | 10 | Cancel | 50.00 | 49.95 |
| 18 | 10/13/2021 9:31:07.0 AM | Firm M | S | 49.96 | 200 | 9 | Cancel | 50.00 | 49.95 |
| 19 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.04 | 100 | 8 | Cancel | 50.00 | 49.95 |
| 20 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.03 | 500 | 7 | Cancel | 50.00 | 49.95 |
| 21 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.00 | 300 | 6 | Cancel | 50.00 | 49.95 |
| 22 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.01 | 150 | 5 | Cancel | 50.04 | 49.95 |
| 23 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.04 | 200 | 3 | Cancel | 50.05 | 49.95 |
def string_to_timestamp(string):
    """Convert a 'MM/DD/YYYY H:MM:SS.f AM/PM' string to a pandas Timestamp.

    The previous version split the string and kept only the date and time
    components, discarding the trailing AM/PM marker — so any afternoon
    time (e.g. '1:31:00.0 PM') would have been misread as a morning time.
    Parsing the full string honors the marker; morning times parse exactly
    as before.
    """
    return pd.to_datetime(string)
manip_df['timestamp'] = manip_df['Time'].apply(string_to_timestamp)
manip_df
| Time | Firm | Side | Price | Quantity | Order ID | Action | NBO | NBB | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10/13/2021 9:31:00.0 AM | Firm S | S | 50.05 | 1000 | 1 | Order | 50.05 | 49.95 | 2021-10-13 09:31:00.000 |
| 1 | 10/13/2021 9:31:00.0 AM | Firm B | B | 49.95 | 1000 | 2 | Order | 50.05 | 49.95 | 2021-10-13 09:31:00.000 |
| 2 | 10/13/2021 9:31:00.2 AM | Firm M | S | 50.04 | 200 | 3 | Order | 50.04 | 49.95 | 2021-10-13 09:31:00.200 |
| 3 | 10/13/2021 9:31:01.5 AM | Firm M | S | 50.03 | 100 | 4 | Order | 50.03 | 49.95 | 2021-10-13 09:31:01.500 |
| 4 | 10/13/2021 9:31:03.0 AM | Firm M | S | 50.03 | 100 | 4 | Cancel | 50.04 | 49.95 | 2021-10-13 09:31:03.000 |
| 5 | 10/13/2021 9:31:03.2 AM | Firm M | S | 50.01 | 150 | 5 | Order | 50.01 | 49.95 | 2021-10-13 09:31:03.200 |
| 6 | 10/13/2021 9:31:03.3 AM | Firm M | S | 50.00 | 300 | 6 | Order | 50.00 | 49.95 | 2021-10-13 09:31:03.300 |
| 7 | 10/13/2021 9:31:04.5 AM | Firm M | S | 50.03 | 500 | 7 | Order | 50.00 | 49.95 | 2021-10-13 09:31:04.500 |
| 8 | 10/13/2021 9:31:04.6 AM | Firm M | S | 50.04 | 200 | 8 | Order | 50.00 | 49.95 | 2021-10-13 09:31:04.600 |
| 9 | 10/13/2021 9:31:05.7 AM | Firm M | S | 49.96 | 200 | 9 | Order | 49.96 | 49.95 | 2021-10-13 09:31:05.700 |
| 10 | 10/13/2021 9:31:05.7 AM | Firm M | S | 50.04 | 100 | 8 | Cancel | 49.96 | 49.95 | 2021-10-13 09:31:05.700 |
| 11 | 10/13/2021 9:31:05.5 AM | Firm M | S | 49.97 | 600 | 10 | Order | 49.96 | 49.95 | 2021-10-13 09:31:05.500 |
| 12 | 10/13/2021 9:31:05.6 AM | Firm M | S | 49.96 | 300 | 11 | Order | 49.96 | 49.95 | 2021-10-13 09:31:05.600 |
| 13 | 10/13/2021 9:31:06.8 AM | Firm S | S | 49.96 | 1000 | 12 | Order | 49.96 | 49.95 | 2021-10-13 09:31:06.800 |
| 14 | 10/13/2021 9:31:06.9 AM | Firm M | B | 49.96 | 1000 | 13 | Order | 49.96 | 49.96 | 2021-10-13 09:31:06.900 |
| 15 | 10/13/2021 9:31:06.9 AM | Firm M | X | 49.96 | 1000 | 12_13 | Trade | 49.96 | 49.95 | 2021-10-13 09:31:06.900 |
| 16 | 10/13/2021 9:31:07.0 AM | Firm M | S | 49.96 | 300 | 11 | Cancel | 49.97 | 49.95 | 2021-10-13 09:31:07.000 |
| 17 | 10/13/2021 9:31:07.0 AM | Firm M | S | 49.97 | 600 | 10 | Cancel | 50.00 | 49.95 | 2021-10-13 09:31:07.000 |
| 18 | 10/13/2021 9:31:07.0 AM | Firm M | S | 49.96 | 200 | 9 | Cancel | 50.00 | 49.95 | 2021-10-13 09:31:07.000 |
| 19 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.04 | 100 | 8 | Cancel | 50.00 | 49.95 | 2021-10-13 09:31:07.000 |
| 20 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.03 | 500 | 7 | Cancel | 50.00 | 49.95 | 2021-10-13 09:31:07.000 |
| 21 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.00 | 300 | 6 | Cancel | 50.00 | 49.95 | 2021-10-13 09:31:07.000 |
| 22 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.01 | 150 | 5 | Cancel | 50.04 | 49.95 | 2021-10-13 09:31:07.000 |
| 23 | 10/13/2021 9:31:07.0 AM | Firm M | S | 50.04 | 200 | 3 | Cancel | 50.05 | 49.95 | 2021-10-13 09:31:07.000 |
import plotly.express as px
import plotly.graph_objects as go
# Scatter of every order-book event, sized by quantity and colored by firm.
fig = px.scatter(manip_df,
                 x='timestamp',
                 y='Price',
                 symbol='Action',
                 color='Firm',
                 size='Quantity',
                 color_discrete_sequence=['goldenrod','aqua','lightgreen'],
                 opacity=.5)
# NBO = National Best Offer and NBB = National Best Bid; the original
# legend labels were swapped relative to the columns being plotted.
fig.add_trace(go.Scatter(x=manip_df['timestamp'],
                         y=manip_df['NBO'],
                         mode='lines',
                         line_color='magenta',
                         line_shape='hv',
                         name='Best Offer'))
fig.add_trace(go.Scatter(x=manip_df['timestamp'],
                         y=manip_df['NBB'],
                         mode='lines',
                         line_color='cyan',
                         line_shape='hv',
                         name='Best Bid'))
fig.update_layout(template='plotly_dark');
from IPython.display import Image
Image(fig.to_image(format="png"))
!xxd -l 112 -s 20000 ~/Downloads/data_feeds_20210122_20210122_IEXTP1_DEEP1.0.pcap
00004e20: 7cb9 0500 ed64 2f58 e805 0000 e805 0000 |....d/X........ 00004e30: 0100 5e57 1504 b859 9ff9 2d53 0800 4500 ..^W...Y..-S..E. 00004e40: 05da e340 4000 4011 9f90 17e2 9b84 e9d7 ...@@.@......... 00004e50: 1504 288a 288a 05c6 0402 0100 0480 0100 ..(.(........... 00004e60: 0000 0000 d948 9605 4200 fc42 0000 0000 .....H..B..B.... 00004e70: 0000 1903 0000 0000 0000 f76d ff74 b88d ...........m.t.. 00004e80: 5c16 1600 4854 c51f ff74 b88d 5c16 4149 \...HT...t..\.AI
#A lovely command line conversion of the binary to a json file
#!pip install iex_parser
#!iex_to_json -i iex_file.pcap.gz -o iex_file.json.gz -t 'GME' -s
import json
with open('iex_deep_quotes_and_trades.json') as file:
line = file.readline()
print(line[:500])
[{"type":"trade_report","event":null,"timestamp":"2021-01-22T13:02:31.215300+00:00","status":null,"symbol":"GME","detail":null,"halt_status":null,"reason":null,"flags":96.0,"size":39.0,"price":45.19,"trade_id":2067095.0,"side":null,"security_event":null},{"type":"trade_report","event":null,"timestamp":"2021-01-22T13:08:14.700160+00:00","status":null,"symbol":"GME","detail":null,"halt_status":null,"reason":null,"flags":96.0,"size":50.0,"price":44.87,"trade_id":2639914.0,"side":null,"security_even
#Read in json file; many json files with a flat, consistent schema and
#little nesting can be read safely as records.
#pandas.io.json.read_json is a private, deprecated import path — the
#public API pd.read_json behaves identically here.
json_df = pd.read_json('iex_deep_quotes_and_trades.json',orient='records')
display(json_df)
| type | event | timestamp | status | symbol | detail | halt_status | reason | flags | size | price | trade_id | side | security_event | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | trade_report | NaN | 2021-01-22 13:02:31.215300+00:00 | NaN | GME | NaN | NaN | NaN | 96 | 39 | 45.19 | 2.067095e+06 | None | NaN |
| 1 | trade_report | NaN | 2021-01-22 13:08:14.700160+00:00 | NaN | GME | NaN | NaN | NaN | 96 | 50 | 44.87 | 2.639914e+06 | None | NaN |
| 2 | trade_report | NaN | 2021-01-22 13:11:52.294756+00:00 | NaN | GME | NaN | NaN | NaN | 96 | 50 | 44.58 | 3.063945e+06 | None | NaN |
| 3 | trade_report | NaN | 2021-01-22 13:18:22.383301+00:00 | NaN | GME | NaN | NaN | NaN | 96 | 50 | 44.04 | 3.669247e+06 | None | NaN |
| 4 | trade_report | NaN | 2021-01-22 13:19:09.002873+00:00 | NaN | GME | NaN | NaN | NaN | 96 | 10 | 43.78 | 3.739332e+06 | None | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 490763 | price_level_update | NaN | 2021-01-28 21:59:06.793899+00:00 | NaN | GME | NaN | NaN | NaN | 1 | 0 | 262.00 | NaN | S | NaN |
| 490764 | price_level_update | NaN | 2021-01-28 21:59:06.797557+00:00 | NaN | GME | NaN | NaN | NaN | 1 | 100 | 262.00 | NaN | S | NaN |
| 490765 | trade_report | NaN | 2021-01-28 21:59:08.241677+00:00 | NaN | GME | NaN | NaN | NaN | 96 | 19 | 262.00 | 3.027362e+09 | None | NaN |
| 490766 | price_level_update | NaN | 2021-01-28 21:59:08.241677+00:00 | NaN | GME | NaN | NaN | NaN | 1 | 0 | 262.00 | NaN | S | NaN |
| 490767 | price_level_update | NaN | 2021-01-28 22:00:00.018339+00:00 | NaN | GME | NaN | NaN | NaN | 1 | 0 | 988.00 | NaN | S | NaN |
490768 rows × 14 columns
#see what the na situation is
json_df.isna().sum()
type 0 event 490768 timestamp 0 status 490768 symbol 0 detail 490768 halt_status 490768 reason 490768 flags 0 size 0 price 0 trade_id 356056 side 134712 security_event 490768 dtype: int64
#get rid of columns that are entirely null
json_df = json_df.dropna(axis = 1,how='all')
display(json_df)
| type | timestamp | symbol | flags | size | price | trade_id | side | |
|---|---|---|---|---|---|---|---|---|
| 0 | trade_report | 2021-01-22 13:02:31.215300+00:00 | GME | 96 | 39 | 45.19 | 2.067095e+06 | None |
| 1 | trade_report | 2021-01-22 13:08:14.700160+00:00 | GME | 96 | 50 | 44.87 | 2.639914e+06 | None |
| 2 | trade_report | 2021-01-22 13:11:52.294756+00:00 | GME | 96 | 50 | 44.58 | 3.063945e+06 | None |
| 3 | trade_report | 2021-01-22 13:18:22.383301+00:00 | GME | 96 | 50 | 44.04 | 3.669247e+06 | None |
| 4 | trade_report | 2021-01-22 13:19:09.002873+00:00 | GME | 96 | 10 | 43.78 | 3.739332e+06 | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 490763 | price_level_update | 2021-01-28 21:59:06.793899+00:00 | GME | 1 | 0 | 262.00 | NaN | S |
| 490764 | price_level_update | 2021-01-28 21:59:06.797557+00:00 | GME | 1 | 100 | 262.00 | NaN | S |
| 490765 | trade_report | 2021-01-28 21:59:08.241677+00:00 | GME | 96 | 19 | 262.00 | 3.027362e+09 | None |
| 490766 | price_level_update | 2021-01-28 21:59:08.241677+00:00 | GME | 1 | 0 | 262.00 | NaN | S |
| 490767 | price_level_update | 2021-01-28 22:00:00.018339+00:00 | GME | 1 | 0 | 988.00 | NaN | S |
490768 rows × 8 columns
#What are we left with?
json_df.isna().sum()
type 0 timestamp 0 symbol 0 flags 0 size 0 price 0 trade_id 356056 side 134712 dtype: int64
#What data types are we working with?
json_df.dtypes
type object timestamp datetime64[ns, UTC] symbol object flags int64 size int64 price float64 trade_id float64 side object dtype: object
#The objects really should be strings
json_df = json_df.astype({'type':'string','symbol':'string','side':'string'})
json_df.dtypes
type string timestamp datetime64[ns, UTC] symbol string flags int64 size int64 price float64 trade_id float64 side string dtype: object
#Fill in nulls on the side column, since nulls may cause trouble plotting trades;
#create a date column for filtering purposes, and change from UTC to EST
from datetime import timezone  # NOTE(review): appears unused — tz is passed as the string 'EST' below
json_df = json_df.fillna({'side':'X'})#replace null sides with a placeholder value
json_df['date'] = json_df.apply({'timestamp':lambda x: x.date})#a column with just the date can be useful; NOTE(review): unusual apply form — confirm it matches json_df['timestamp'].dt.date
json_df['timestamp'] = json_df['timestamp'].apply(lambda x: x.astimezone(tz='EST').replace(tzinfo=None))#convert each UTC timestamp to EST wall-clock time, then drop the tz info
display(json_df)
| type | timestamp | symbol | flags | size | price | trade_id | side | date | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | trade_report | 2021-01-22 08:02:31.215300 | GME | 96 | 39 | 45.19 | 2.067095e+06 | X | 2021-01-22 |
| 1 | trade_report | 2021-01-22 08:08:14.700160 | GME | 96 | 50 | 44.87 | 2.639914e+06 | X | 2021-01-22 |
| 2 | trade_report | 2021-01-22 08:11:52.294756 | GME | 96 | 50 | 44.58 | 3.063945e+06 | X | 2021-01-22 |
| 3 | trade_report | 2021-01-22 08:18:22.383301 | GME | 96 | 50 | 44.04 | 3.669247e+06 | X | 2021-01-22 |
| 4 | trade_report | 2021-01-22 08:19:09.002873 | GME | 96 | 10 | 43.78 | 3.739332e+06 | X | 2021-01-22 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 490763 | price_level_update | 2021-01-28 16:59:06.793899 | GME | 1 | 0 | 262.00 | NaN | S | 2021-01-28 |
| 490764 | price_level_update | 2021-01-28 16:59:06.797557 | GME | 1 | 100 | 262.00 | NaN | S | 2021-01-28 |
| 490765 | trade_report | 2021-01-28 16:59:08.241677 | GME | 96 | 19 | 262.00 | 3.027362e+09 | X | 2021-01-28 |
| 490766 | price_level_update | 2021-01-28 16:59:08.241677 | GME | 1 | 0 | 262.00 | NaN | S | 2021-01-28 |
| 490767 | price_level_update | 2021-01-28 17:00:00.018339 | GME | 1 | 0 | 988.00 | NaN | S | 2021-01-28 |
490768 rows × 9 columns
#Create the subset that is just the order book updates
mask = json_df['type']=='price_level_update'# boolean row filter
select_cols = ['timestamp','size','price','side']# columns needed to rebuild the book
sort_cols = ['timestamp','price']# replay events in time order
order_df = json_df.loc[mask,select_cols].sort_values(sort_cols)
class PriceLevels(dict):
    """One side of an order book as a price -> total size mapping.

    Keys are price levels and values are the displayed size resting at
    that price. Subclassing dict means all standard mapping operations
    (merging, copying, comparison) work directly on the levels.
    """
    def ignore_item(self,item):
        """No-op; kept for backward compatibility with existing callers."""
        return self
    def add_or_discard(self,size,price,side,quote_side):
        """Apply one price-level update to this side of the book.

        Updates only take effect when side matches quote_side: a positive
        size sets the level, a zero size removes it. Updates for the
        opposite side are ignored. Returns self for chaining.
        """
        if (size > 0)&(side==quote_side):
            self.update({price:size})
        elif (size == 0)&(side==quote_side):
            # pop with a default so removing a price that is not present
            # is a no-op (the original raised KeyError here)
            self.pop(price,None)
        # updates for the opposite side of the book are deliberately ignored
        return self
    def get_bbo(self,side):
        """Return the best bid (highest price) or best offer (lowest price).

        Returns None when the side is empty or side is not 'B'/'S'.
        """
        if len(self)==0:
            return None
        if side == 'B':
            return max(self.keys())
        elif side == 'S':
            return min(self.keys())
        else:
            return None
    def get_vwap(self):
        """Return the size-weighted average price, or None if empty."""
        if len(self)==0:
            return None
        return sum(k*v for k,v in self.items())/sum(self.values())
    def update_prices(self,size,price,side,quote_side):
        """Apply an update in place and return a snapshot copy of the levels."""
        self.add_or_discard(size,price,side,quote_side)
        return PriceLevels(self.copy())
a = PriceLevels({45.45:100,50:100,55:50})
print('set price levels: ',a)
a.update_prices(100,46.05,'B','B')
print('update price level same side: ',a)
a.update_prices(100,46.10,'S','B')
print('update price level opposite side: ',a)
a.update_prices(0,50,'B','B')
print('remove price same side: ',a)
print('get the best bid: ',a.get_bbo('B'))
print('get vwap: ',a.get_vwap())
set price levels: {45.45: 100, 50: 100, 55: 50}
update price level same side: {45.45: 100, 50: 100, 55: 50, 46.05: 100}
update price level opposite side: {45.45: 100, 50: 100, 55: 50, 46.05: 100}
remove price same side: {45.45: 100, 55: 50, 46.05: 100}
get the best bid: 55
get vwap: 47.6
#For each timestamp, capture the current sizes at each available price,
#the best bid and offer, and the VWAP (volume-weighted average price)
#of the buy orders, the sell orders, and all orders combined
bid = PriceLevels()
ofr = PriceLevels()
quotes = dict()
#Use iterrows() to replay the book one event at a time
for row in order_df.iterrows():
    timestamp,size,price,side = row[1]
    quotes[timestamp] = {'bid':bid.update_prices(size,price,side,'B'),# snapshot of bid levels
                         'ofr':ofr.update_prices(size,price,side,'S'),# snapshot of offer levels
                         'best_bid':bid.get_bbo('B'),
                         'bid_vwap':bid.get_vwap(),
                         'best_ofr':ofr.get_bbo('S'),
                         'ofr_vwap':ofr.get_vwap(),
                         'avg_vwap':PriceLevels({**bid,**ofr}).get_vwap(),# both sides merged
                         'date':timestamp.date()}
list(quotes.items())[500:504]
[(Timestamp('2021-01-22 09:31:48.205629'),
{'bid': {24.98: 100},
'ofr': {43.61: 300},
'best_bid': 24.98,
'bid_vwap': 24.98,
'best_ofr': 43.61,
'ofr_vwap': 43.61,
'avg_vwap': 38.9525,
'date': datetime.date(2021, 1, 22)}),
(Timestamp('2021-01-22 09:31:48.320642'),
{'bid': {24.98: 100},
'ofr': {43.61: 300, 43.59: 300},
'best_bid': 24.98,
'bid_vwap': 24.98,
'best_ofr': 43.59,
'ofr_vwap': 43.6,
'avg_vwap': 40.94,
'date': datetime.date(2021, 1, 22)}),
(Timestamp('2021-01-22 09:31:48.320661'),
{'bid': {24.98: 100},
'ofr': {43.59: 300},
'best_bid': 24.98,
'bid_vwap': 24.98,
'best_ofr': 43.59,
'ofr_vwap': 43.59,
'avg_vwap': 38.93750000000001,
'date': datetime.date(2021, 1, 22)}),
(Timestamp('2021-01-22 09:31:48.325218'),
{'bid': {24.98: 100},
'ofr': {43.6: 300},
'best_bid': 24.98,
'bid_vwap': 24.98,
'best_ofr': 43.6,
'ofr_vwap': 43.6,
'avg_vwap': 38.945,
'date': datetime.date(2021, 1, 22)})]
#Create a dataframe of the state of the order book at each timestamp
quote_df = (pd.DataFrame
            .from_dict(quotes,orient='index')
            .reset_index()
            .rename(columns={'index':'timestamp'})
            .dropna(subset=['best_bid','best_ofr'],how='all'))# drop rows where both book sides are empty
display(quote_df)
| timestamp | bid | ofr | best_bid | bid_vwap | best_ofr | ofr_vwap | avg_vwap | date | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-22 08:25:48.218283 | {44.45: 1000} | {} | 44.45 | 44.45 | NaN | NaN | 44.45000 | 2021-01-22 |
| 2 | 2021-01-22 08:44:23.578790 | {44.04: 1000} | {} | 44.04 | 44.04 | NaN | NaN | 44.04000 | 2021-01-22 |
| 4 | 2021-01-22 08:49:02.526710 | {} | {43.66: 100} | NaN | NaN | 43.66 | 43.66000 | 43.66000 | 2021-01-22 |
| 6 | 2021-01-22 08:49:28.189795 | {43.34: 100} | {} | 43.34 | 43.34 | NaN | NaN | 43.34000 | 2021-01-22 |
| 8 | 2021-01-22 08:51:11.786607 | {43.41: 100} | {} | 43.41 | 43.41 | NaN | NaN | 43.41000 | 2021-01-22 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 342613 | 2021-01-28 16:56:26.486624 | {} | {988.0: 112} | NaN | NaN | 988.00 | 988.00000 | 988.00000 | 2021-01-28 |
| 342614 | 2021-01-28 16:59:06.409716 | {} | {988.0: 112, 262.0: 100} | NaN | NaN | 262.00 | 645.54717 | 645.54717 | 2021-01-28 |
| 342615 | 2021-01-28 16:59:06.793899 | {} | {988.0: 112} | NaN | NaN | 988.00 | 988.00000 | 988.00000 | 2021-01-28 |
| 342616 | 2021-01-28 16:59:06.797557 | {} | {988.0: 112, 262.0: 100} | NaN | NaN | 262.00 | 645.54717 | 645.54717 | 2021-01-28 |
| 342617 | 2021-01-28 16:59:08.241677 | {} | {988.0: 112} | NaN | NaN | 988.00 | 988.00000 | 988.00000 | 2021-01-28 |
342023 rows × 9 columns
#Get just the trades from the original dataframe
trade_df = json_df.loc[json_df['type']=='trade_report',:]
trade_df
| type | timestamp | symbol | flags | size | price | trade_id | side | date | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | trade_report | 2021-01-22 08:02:31.215300 | GME | 96 | 39 | 45.19 | 2.067095e+06 | X | 2021-01-22 |
| 1 | trade_report | 2021-01-22 08:08:14.700160 | GME | 96 | 50 | 44.87 | 2.639914e+06 | X | 2021-01-22 |
| 2 | trade_report | 2021-01-22 08:11:52.294756 | GME | 96 | 50 | 44.58 | 3.063945e+06 | X | 2021-01-22 |
| 3 | trade_report | 2021-01-22 08:18:22.383301 | GME | 96 | 50 | 44.04 | 3.669247e+06 | X | 2021-01-22 |
| 4 | trade_report | 2021-01-22 08:19:09.002873 | GME | 96 | 10 | 43.78 | 3.739332e+06 | X | 2021-01-22 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 490754 | trade_report | 2021-01-28 16:54:05.085758 | GME | 96 | 5 | 257.58 | 3.026635e+09 | X | 2021-01-28 |
| 490755 | trade_report | 2021-01-28 16:54:05.592947 | GME | 96 | 22 | 257.58 | 3.026636e+09 | X | 2021-01-28 |
| 490759 | trade_report | 2021-01-28 16:56:26.486624 | GME | 224 | 1 | 264.00 | 3.026962e+09 | X | 2021-01-28 |
| 490762 | trade_report | 2021-01-28 16:59:06.793899 | GME | 192 | 100 | 262.00 | 3.027353e+09 | X | 2021-01-28 |
| 490765 | trade_report | 2021-01-28 16:59:08.241677 | GME | 96 | 19 | 262.00 | 3.027362e+09 | X | 2021-01-28 |
134712 rows × 9 columns
#Combine the two dataframes so that each trade report is associated with
#the set of 'prevailing quotes' in effect when the trade printed
plot_df = (pd.concat([quote_df,trade_df],ignore_index=False)
           .fillna({'type':'quote','symbol':'GME'})# label the rows that came from quote_df
           .sort_values(['timestamp','type'])# interleave quotes and trades in time order
           .drop('side',axis=1)
           .ffill()# carry the latest quote values forward onto each trade row
           .dropna(subset=['best_bid','best_ofr'])
           .astype({'date':'string'}))
plot_df = plot_df.loc[plot_df['type']=='trade_report',:]# keep only the trade rows
plot_df['midpt'] = (plot_df['best_bid']+plot_df['best_ofr'])/2# midpoint of the best bid/offer
def make_plot(plot_df,date):
    """Plot trades against the prevailing quote levels for one date.

    Filters plot_df to the rows whose date matches `date` and returns a
    plotly Figure showing the best bid/offer, the VWAP of each side
    (hidden until toggled via the legend), the trades, and the overall
    quote VWAP line.
    """
    import plotly.graph_objects as go
    daily_df = plot_df.loc[plot_df.date.astype('string')==date,:]
    # One entry per trace, in legend order; each is expanded into a Scatter.
    trace_specs = [
        dict(y=daily_df['best_bid'],mode='markers',marker_color='blue',
             opacity=0.75,line_shape='hv',name='Best Bid'),
        dict(y=daily_df['bid_vwap'],mode='markers',opacity=0.75,
             marker_color='lightblue',name='VWAP of Bids',visible='legendonly'),
        dict(y=daily_df['best_ofr'],mode='markers',marker_color='red',
             opacity=0.75,line_shape='hv',name='Best Offer'),
        dict(y=daily_df['ofr_vwap'],mode='markers',opacity=0.75,
             marker_color='pink',name='VWAP of offers',visible='legendonly'),
        dict(y=daily_df['price'],mode='markers',marker_color='green',
             marker_symbol='x',name='Trades'),
        dict(y=daily_df['avg_vwap'],mode='lines',line_color='orange',
             name='VWAP of Quotes'),
    ]
    fig = go.Figure()
    for spec in trace_specs:
        fig.add_trace(go.Scatter(x=daily_df['timestamp'],**spec))
    fig.update_layout(xaxis_tickformat = "%H:%M:%S.%f",
                      xaxis_title='Time',
                      yaxis_title='Price',
                      title='Quotes and Trades on IEX for '+date,
                      template='plotly_dark')
    return fig
fig = make_plot(plot_df,"2021-01-22")
Image(fig.to_image(format="png"))
fig = make_plot(plot_df,"2021-01-25")
Image(fig.to_image(format="png"))
fig = make_plot(plot_df,"2021-01-26")
fig.add_annotation(x=pd.to_datetime('2021-01-26T16:07:00'),
y=150,
text="Gamestonk!",
showarrow=True,
arrowhead=4)
Image(fig.to_image(format="png"))
make_plot(plot_df,"2021-01-27")
Image(fig.to_image(format="png"))
make_plot(plot_df,"2021-01-28")
Image(fig.to_image(format="png"))
# Reshape so each trade row yields two rows: one for avg_vwap, one for midpt
melt_df = plot_df.melt(id_vars=['timestamp','price','date'],
                       value_vars = ['avg_vwap','midpt'],
                       value_name='Quote',
                       var_name='Type')
melt_df
import plotly.express as px
# Trade price vs. quote price, one facet per quote measure, colored by day
fig = px.scatter(melt_df,
                 x='Quote',
                 y='price',
                 opacity = 0.3,
                 color='date',
                 facet_col='Type')
# Rename the auto-generated facet labels ('Type=...') to readable titles
for attribute in fig.layout.annotations:
    conversion_dict = {'Type=avg_vwap':'Average VWAP of Quotes',
                       'Type=midpt':'Midpoint of BBO'}
    attribute.text = conversion_dict[attribute.text]
fig.update_layout(xaxis_title='Quote Price',
                  xaxis2_title='Quote Price',
                  yaxis_title='Trade Price',
                  legend_title='Date',
                  title='Actual Trade Price vs Quote Prices',
                  template='plotly_dark')
# Diagonal y=x reference line: points on it traded exactly at the quote price
fig.add_shape(type="line",
              x0=0, y0=0, x1=500, y1=500,
              line=dict(color="white",width=3),
              row='all',col='all');
fig.show()
img_bytes = fig.to_image(format="png")
Image(img_bytes)